import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

def preprocess_sample(sample_df, preprocessor):
    """Transform a sample exactly as the training data was transformed.

    The previous implementation looked up ``'scaler'`` and ``'preprocessor'``
    inside ``preprocessor.named_transformers_`` and read a ``transformer_idx``
    attribute; none of those exist on the ColumnTransformer built in this
    file (its transformers are ``'num'`` and ``'cat'``), so the call always
    failed. It also tried to scale only the numeric columns, whereas the
    training pipeline scales the whole encoded matrix.

    Args:
        sample_df: DataFrame holding the raw sample(s), with the same feature
            columns that were used for fitting.
        preprocessor: Either a fitted transformer exposing ``transform``
            (e.g. the ColumnTransformer), or a fitted pipeline-like object
            exposing ``steps`` as a list of ``(name, step)`` pairs (e.g. the
            full Pipeline or ``GridSearchCV.best_estimator_``).

    Returns:
        The transformed sample, i.e. the matrix the final estimator would
        receive. For a pipeline-like object every step that has a
        ``transform`` method is applied in order, stopping at the first step
        without one (the final estimator).
    """
    steps = getattr(preprocessor, "steps", None)
    if steps is None:
        # A single fitted transformer: one transform call is all that is needed.
        return preprocessor.transform(sample_df)

    transformed = sample_df
    for _name, step in steps:
        # Pipelines allow disabled steps; they leave the data untouched.
        if step is None or (isinstance(step, str) and step == "passthrough"):
            continue
        # The final estimator (e.g. a regressor) has no transform method.
        if not hasattr(step, "transform"):
            break
        transformed = step.transform(transformed)
    return transformed



# Load the dataset; it must contain the feature columns listed below and the
# target column.
df = pd.read_csv('data.csv')  # replace with the path to your own dataset

# Name of the target column to predict.
target = 'score_per'

# Column names of the categorical features.
categorical_features = ['tag', 'creator', 'contributor', 'title', 'bossStatus']

# Column names of the numerical features.
numerical_features = ['year', 'month', 'hot_score']

# Cast every categorical feature to str so the encoder sees a single type.
for feature in categorical_features:
    df[feature] = df[feature].astype(str)

# Split features and target.
# Each numeric column is imputed with its OWN mean; previously every missing
# value (including 'year' and 'month') was filled with the mean of 'hot_score'.
X = df[categorical_features + numerical_features].copy()
X[numerical_features] = X[numerical_features].fillna(X[numerical_features].mean())
# NOTE(review): missing targets are imputed with the mean, as before; dropping
# those rows instead may give a more honest evaluation - confirm intent.
y = df[target].fillna(df[target].mean())

# Hold out 10% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, ignoring categories that were not seen during fitting.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # with_mean=False skips centering, so the scaler only divides by the
    # per-column scale and never has to compute/subtract a mean.
    ('scaler', StandardScaler(with_mean=False)),
    ('regressor', LinearRegression())
])

# Hyper-parameter grid to search.
param_grid = {
    'regressor__fit_intercept': [True, False],
}

# 5-fold cross-validated grid search, scored by negative MSE.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Fit on the training set and pick the best parameters.
grid_search.fit(X_train, y_train)

# Best pipeline found by the search. It is used for all predictions below;
# previously the un-tuned pipeline was refit afterwards and the search result
# was silently discarded.
best_model = grid_search.best_estimator_

# Predict the held-out test set.
y_pred = best_model.predict(X_test)
print(y_pred)

# Evaluate the model on the test set.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R2 Score: {r2:.2f}')

# A single hand-written sample to score.
sample = {'tag': '都市;爱情;剧情;生活;当代;内地',
          'creator': '王 欢',
          'contributor': "'江疏影', '杨采钰', '张佳宁', '张慧雯', '李浩菲'",
          'title': '唐人街探案2',
          'year': 2024,
          'month': 3,
          'bossStatus': 'FREE',
          'hot_score': 6008}

# The fitted pipeline applies encoding and scaling itself, so the raw sample
# can be passed straight to predict. The sample was previously built but
# never used.
sample_df = pd.DataFrame([sample])
sample_pred = best_model.predict(sample_df)
print(f'Predicted {target} for sample: {sample_pred[0]:.2f}')